1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.lucene.uninverting;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.List;
26
27 import org.apache.lucene.codecs.PostingsFormat;
28 import org.apache.lucene.index.DocValues;
29 import org.apache.lucene.index.DocValuesType;
30 import org.apache.lucene.index.PostingsEnum;
31 import org.apache.lucene.index.FieldInfo;
32 import org.apache.lucene.index.LeafReader;
33 import org.apache.lucene.index.SortedSetDocValues;
34 import org.apache.lucene.index.Terms;
35 import org.apache.lucene.index.TermsEnum;
36 import org.apache.lucene.search.DocIdSetIterator;
37 import org.apache.lucene.util.Accountable;
38 import org.apache.lucene.util.Bits;
39 import org.apache.lucene.util.BytesRef;
40 import org.apache.lucene.util.PagedBytes;
41 import org.apache.lucene.util.StringHelper;
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113 public class DocTermOrds implements Accountable {
114
115
116
117 private final static int TNUM_OFFSET = 2;
118
119
120 public final static int DEFAULT_INDEX_INTERVAL_BITS = 7;
121
122 private int indexIntervalBits;
123 private int indexIntervalMask;
124 private int indexInterval;
125
126
127 protected final int maxTermDocFreq;
128
129
130 protected final String field;
131
132
133 protected int numTermsInField;
134
135
136 protected long termInstances;
137 private long memsz;
138
139
140 protected int total_time;
141
142
143 protected int phase1_time;
144
145
146 protected int[] index;
147
148
149 protected byte[][] tnums = new byte[256][];
150
151
152 protected long sizeOfIndexedStrings;
153
154
155 protected BytesRef[] indexedTermsArray = new BytesRef[0];
156
157
158
159 protected BytesRef prefix;
160
161
162
163
164 protected int ordBase;
165
166
167 protected PostingsEnum postingsEnum;
168
169
170 public long ramBytesUsed() {
171
172 if (memsz!=0) return memsz;
173 long sz = 8*8 + 32;
174 if (index != null) sz += index.length * 4;
175 if (tnums!=null) {
176 for (byte[] arr : tnums)
177 if (arr != null) sz += arr.length;
178 }
179 memsz = sz;
180 return sz;
181 }
182
183 @Override
184 public Collection<Accountable> getChildResources() {
185 return Collections.emptyList();
186 }
187
188
189 public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
190 this(reader, liveDocs, field, null, Integer.MAX_VALUE);
191 }
192
193
194
195
196 public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix) throws IOException {
197 this(reader, liveDocs, field, termPrefix, Integer.MAX_VALUE);
198 }
199
200
201
202
203 public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq) throws IOException {
204 this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS);
205 }
206
207
208
209
210
211 public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits) throws IOException {
212 this(field, maxTermDocFreq, indexIntervalBits);
213 uninvert(reader, liveDocs, termPrefix);
214 }
215
216
217
218 protected DocTermOrds(String field, int maxTermDocFreq, int indexIntervalBits) {
219
220 this.field = field;
221 this.maxTermDocFreq = maxTermDocFreq;
222 this.indexIntervalBits = indexIntervalBits;
223 indexIntervalMask = 0xffffffff >>> (32-indexIntervalBits);
224 indexInterval = 1 << indexIntervalBits;
225 }
226
227
228
229
230
231
232
233
234
235
236
237
238
239 public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException {
240
241
242
243 assert null != indexedTermsArray;
244
245 if (0 == indexedTermsArray.length) {
246 return null;
247 } else {
248 return new OrdWrappedTermsEnum(reader);
249 }
250 }
251
252
253
254
255 public int numTerms() {
256 return numTermsInField;
257 }
258
259
260
261
262 public boolean isEmpty() {
263 return index == null;
264 }
265
266
267 protected void visitTerm(TermsEnum te, int termNum) throws IOException {
268 }
269
270
271
272
273 protected void setActualDocFreq(int termNum, int df) throws IOException {
274 }
275
276
277 protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
278 final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
279 if (info != null && info.getDocValuesType() != DocValuesType.NONE) {
280 throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
281 }
282
283 final long startTime = System.currentTimeMillis();
284 prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);
285
286 final int maxDoc = reader.maxDoc();
287 final int[] index = new int[maxDoc];
288 final int[] lastTerm = new int[maxDoc];
289 final byte[][] bytes = new byte[maxDoc][];
290
291 final Terms terms = reader.terms(field);
292 if (terms == null) {
293
294 return;
295 }
296
297 final TermsEnum te = terms.iterator();
298 final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
299
300 if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
301
302 return;
303 }
304
305
306 final List<BytesRef> indexedTerms = new ArrayList<>();
307 final PagedBytes indexedTermsBytes = new PagedBytes(15);
308
309
310
311 byte[] tempArr = new byte[12];
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329 int termNum = 0;
330 postingsEnum = null;
331
332
333
334 for (;;) {
335 final BytesRef t = te.term();
336 if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
337 break;
338 }
339
340
341 visitTerm(te, termNum);
342
343 if ((termNum & indexIntervalMask) == 0) {
344
345 sizeOfIndexedStrings += t.length;
346 BytesRef indexedTerm = new BytesRef();
347 indexedTermsBytes.copy(t, indexedTerm);
348
349
350 indexedTerms.add(indexedTerm);
351 }
352
353 final int df = te.docFreq();
354 if (df <= maxTermDocFreq) {
355
356 postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);
357
358
359 int actualDF = 0;
360
361 for (;;) {
362 int doc = postingsEnum.nextDoc();
363 if (doc == DocIdSetIterator.NO_MORE_DOCS) {
364 break;
365 }
366
367
368 actualDF ++;
369 termInstances++;
370
371
372
373
374 int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
375 lastTerm[doc] = termNum;
376 int val = index[doc];
377
378 if ((val & 0xff)==1) {
379
380
381 int pos = val >>> 8;
382 int ilen = vIntSize(delta);
383 byte[] arr = bytes[doc];
384 int newend = pos+ilen;
385 if (newend > arr.length) {
386
387
388
389
390
391
392 int newLen = (newend + 3) & 0xfffffffc;
393 byte[] newarr = new byte[newLen];
394 System.arraycopy(arr, 0, newarr, 0, pos);
395 arr = newarr;
396 bytes[doc] = newarr;
397 }
398 pos = writeInt(delta, arr, pos);
399 index[doc] = (pos<<8) | 1;
400 } else {
401
402
403 int ipos;
404 if (val==0) {
405 ipos=0;
406 } else if ((val & 0x0000ff80)==0) {
407 ipos=1;
408 } else if ((val & 0x00ff8000)==0) {
409 ipos=2;
410 } else if ((val & 0xff800000)==0) {
411 ipos=3;
412 } else {
413 ipos=4;
414 }
415
416
417
418 int endPos = writeInt(delta, tempArr, ipos);
419
420 if (endPos <= 4) {
421
422
423 for (int j=ipos; j<endPos; j++) {
424 val |= (tempArr[j] & 0xff) << (j<<3);
425 }
426 index[doc] = val;
427 } else {
428
429 for (int j=0; j<ipos; j++) {
430 tempArr[j] = (byte)val;
431 val >>>=8;
432 }
433
434 index[doc] = (endPos<<8) | 1;
435 bytes[doc] = tempArr;
436 tempArr = new byte[12];
437 }
438 }
439 }
440 setActualDocFreq(termNum, actualDF);
441 }
442
443 termNum++;
444 if (te.next() == null) {
445 break;
446 }
447 }
448
449 numTermsInField = termNum;
450
451 long midPoint = System.currentTimeMillis();
452
453 if (termInstances == 0) {
454
455
456 tnums = null;
457 } else {
458
459 this.index = index;
460
461
462
463
464
465
466
467 for (int pass = 0; pass<256; pass++) {
468 byte[] target = tnums[pass];
469 int pos=0;
470 if (target != null) {
471 pos = target.length;
472 } else {
473 target = new byte[4096];
474 }
475
476
477
478
479 for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
480 int lim = Math.min(docbase + (1<<16), maxDoc);
481 for (int doc=docbase; doc<lim; doc++) {
482
483 int val = index[doc];
484 if ((val&0xff) == 1) {
485 int len = val >>> 8;
486
487 index[doc] = (pos<<8)|1;
488 if ((pos & 0xff000000) != 0) {
489
490 throw new IllegalStateException("Too many values for UnInvertedField faceting on field "+field);
491 }
492 byte[] arr = bytes[doc];
493
494
495
496
497
498 bytes[doc] = null;
499 if (target.length <= pos + len) {
500 int newlen = target.length;
501
502
503
504
505
506
507
508
509
510
511
512
513 while (newlen <= pos + len) newlen<<=1;
514 byte[] newtarget = new byte[newlen];
515 System.arraycopy(target, 0, newtarget, 0, pos);
516 target = newtarget;
517 }
518 System.arraycopy(arr, 0, target, pos, len);
519 pos += len + 1;
520 }
521 }
522 }
523
524
525 if (pos < target.length) {
526 byte[] newtarget = new byte[pos];
527 System.arraycopy(target, 0, newtarget, 0, pos);
528 target = newtarget;
529 }
530
531 tnums[pass] = target;
532
533 if ((pass << 16) > maxDoc)
534 break;
535 }
536
537 }
538 indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
539
540 long endTime = System.currentTimeMillis();
541
542 total_time = (int)(endTime-startTime);
543 phase1_time = (int)(midPoint-startTime);
544 }
545
546
547 private static int vIntSize(int x) {
548 if ((x & (0xffffffff << (7*1))) == 0 ) {
549 return 1;
550 }
551 if ((x & (0xffffffff << (7*2))) == 0 ) {
552 return 2;
553 }
554 if ((x & (0xffffffff << (7*3))) == 0 ) {
555 return 3;
556 }
557 if ((x & (0xffffffff << (7*4))) == 0 ) {
558 return 4;
559 }
560 return 5;
561 }
562
563
564
565 private static int writeInt(int x, byte[] arr, int pos) {
566 int a;
567 a = (x >>> (7*4));
568 if (a != 0) {
569 arr[pos++] = (byte)(a | 0x80);
570 }
571 a = (x >>> (7*3));
572 if (a != 0) {
573 arr[pos++] = (byte)(a | 0x80);
574 }
575 a = (x >>> (7*2));
576 if (a != 0) {
577 arr[pos++] = (byte)(a | 0x80);
578 }
579 a = (x >>> (7*1));
580 if (a != 0) {
581 arr[pos++] = (byte)(a | 0x80);
582 }
583 arr[pos++] = (byte)(x & 0x7f);
584 return pos;
585 }
586
587
588
589
590
591 private final class OrdWrappedTermsEnum extends TermsEnum {
592 private final TermsEnum termsEnum;
593 private BytesRef term;
594 private long ord = -indexInterval-1;
595
596 public OrdWrappedTermsEnum(LeafReader reader) throws IOException {
597 assert indexedTermsArray != null;
598 assert 0 != indexedTermsArray.length;
599 termsEnum = reader.fields().terms(field).iterator();
600 }
601
602 @Override
603 public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
604 return termsEnum.postings(reuse, flags);
605 }
606
607 @Override
608 public BytesRef term() {
609 return term;
610 }
611
612 @Override
613 public BytesRef next() throws IOException {
614 if (++ord < 0) {
615 ord = 0;
616 }
617 if (termsEnum.next() == null) {
618 term = null;
619 return null;
620 }
621 return setTerm();
622 }
623
624 @Override
625 public int docFreq() throws IOException {
626 return termsEnum.docFreq();
627 }
628
629 @Override
630 public long totalTermFreq() throws IOException {
631 return termsEnum.totalTermFreq();
632 }
633
634 @Override
635 public long ord() {
636 return ordBase + ord;
637 }
638
639 @Override
640 public SeekStatus seekCeil(BytesRef target) throws IOException {
641
642
643 if (term != null && term.equals(target)) {
644 return SeekStatus.FOUND;
645 }
646
647 int startIdx = Arrays.binarySearch(indexedTermsArray, target);
648
649 if (startIdx >= 0) {
650
651 TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
652 assert seekStatus == TermsEnum.SeekStatus.FOUND;
653 ord = startIdx << indexIntervalBits;
654 setTerm();
655 assert term != null;
656 return SeekStatus.FOUND;
657 }
658
659
660 startIdx = -startIdx-1;
661
662 if (startIdx == 0) {
663
664 TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
665 assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
666 ord = 0;
667 setTerm();
668 assert term != null;
669 return SeekStatus.NOT_FOUND;
670 }
671
672
673 startIdx--;
674
675 if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
676
677
678 } else {
679
680 TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
681 assert seekStatus == TermsEnum.SeekStatus.FOUND;
682 ord = startIdx << indexIntervalBits;
683 setTerm();
684 assert term != null;
685 }
686
687 while (term != null && term.compareTo(target) < 0) {
688 next();
689 }
690
691 if (term == null) {
692 return SeekStatus.END;
693 } else if (term.compareTo(target) == 0) {
694 return SeekStatus.FOUND;
695 } else {
696 return SeekStatus.NOT_FOUND;
697 }
698 }
699
700 @Override
701 public void seekExact(long targetOrd) throws IOException {
702 int delta = (int) (targetOrd - ordBase - ord);
703
704 if (delta < 0 || delta > indexInterval) {
705 final int idx = (int) (targetOrd >>> indexIntervalBits);
706 final BytesRef base = indexedTermsArray[idx];
707
708 ord = idx << indexIntervalBits;
709 delta = (int) (targetOrd - ord);
710 final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base);
711 assert seekStatus == TermsEnum.SeekStatus.FOUND;
712 } else {
713
714 }
715
716 while (--delta >= 0) {
717 BytesRef br = termsEnum.next();
718 if (br == null) {
719 assert false;
720 return;
721 }
722 ord++;
723 }
724
725 setTerm();
726 assert term != null;
727 }
728
729 private BytesRef setTerm() throws IOException {
730 term = termsEnum.term();
731
732 if (prefix != null && !StringHelper.startsWith(term, prefix)) {
733 term = null;
734 }
735 return term;
736 }
737 }
738
739
740
741 public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
742 termsEnum.seekExact(ord);
743 return termsEnum.term();
744 }
745
746
747 public SortedSetDocValues iterator(LeafReader reader) throws IOException {
748 if (isEmpty()) {
749 return DocValues.emptySortedSet();
750 } else {
751 return new Iterator(reader);
752 }
753 }
754
755 private class Iterator extends SortedSetDocValues {
756 final LeafReader reader;
757 final TermsEnum te;
758
759 final int buffer[] = new int[5];
760 int bufferUpto;
761 int bufferLength;
762
763 private int tnum;
764 private int upto;
765 private byte[] arr;
766
767 Iterator(LeafReader reader) throws IOException {
768 this.reader = reader;
769 this.te = termsEnum();
770 }
771
772 @Override
773 public long nextOrd() {
774 while (bufferUpto == bufferLength) {
775 if (bufferLength < buffer.length) {
776 return NO_MORE_ORDS;
777 } else {
778 bufferLength = read(buffer);
779 bufferUpto = 0;
780 }
781 }
782 return buffer[bufferUpto++];
783 }
784
785
786
787
788 int read(int[] buffer) {
789 int bufferUpto = 0;
790 if (arr == null) {
791
792
793 int code = upto;
794 int delta = 0;
795 for (;;) {
796 delta = (delta << 7) | (code & 0x7f);
797 if ((code & 0x80)==0) {
798 if (delta==0) break;
799 tnum += delta - TNUM_OFFSET;
800 buffer[bufferUpto++] = ordBase+tnum;
801
802 delta = 0;
803 }
804 code >>>= 8;
805 }
806 } else {
807
808 for(;;) {
809 int delta = 0;
810 for(;;) {
811 byte b = arr[upto++];
812 delta = (delta << 7) | (b & 0x7f);
813
814 if ((b & 0x80) == 0) break;
815 }
816
817 if (delta == 0) break;
818 tnum += delta - TNUM_OFFSET;
819
820 buffer[bufferUpto++] = ordBase+tnum;
821 if (bufferUpto == buffer.length) {
822 break;
823 }
824 }
825 }
826
827 return bufferUpto;
828 }
829
830 @Override
831 public void setDocument(int docID) {
832 tnum = 0;
833 final int code = index[docID];
834 if ((code & 0xff)==1) {
835
836 upto = code>>>8;
837
838 int whichArray = (docID >>> 16) & 0xff;
839 arr = tnums[whichArray];
840 } else {
841
842 arr = null;
843 upto = code;
844 }
845 bufferUpto = 0;
846 bufferLength = read(buffer);
847 }
848
849 @Override
850 public BytesRef lookupOrd(long ord) {
851 try {
852 return DocTermOrds.this.lookupTerm(te, (int) ord);
853 } catch (IOException e) {
854 throw new RuntimeException(e);
855 }
856 }
857
858 @Override
859 public long getValueCount() {
860 return numTerms();
861 }
862
863 @Override
864 public long lookupTerm(BytesRef key) {
865 try {
866 switch (te.seekCeil(key)) {
867 case FOUND:
868 assert te.ord() >= 0;
869 return te.ord();
870 case NOT_FOUND:
871 assert te.ord() >= 0;
872 return -te.ord()-1;
873 default:
874 return -numTerms()-1;
875 }
876 } catch (IOException e) {
877 throw new RuntimeException(e);
878 }
879 }
880
881 @Override
882 public TermsEnum termsEnum() {
883 try {
884 return getOrdTermsEnum(reader);
885 } catch (IOException e) {
886 throw new RuntimeException(e);
887 }
888 }
889 }
890 }